#Plant #Seedlings #Leaf #OpenCV2 #Matplotlib #Seaborn #LabelEncoder #PCA #t-SNE #EigenVector #CNN #TransferLearning #Xception #VGG #Keras #Sklearn #ClassifierComparison #XGB
Author: Indiano, July 2020. Adopted from the Cross Industry Standard Process for Data Mining (CRISP-DM).

Plant Seedlings Classification V2
from IPython.display import Image
Image(filename='/home/indiano/Desktop/ren-ran-Jy6luiLBsrk-unsplash.jpg')
Classify an image of seedling into one the following 12 different seedling classes.
Can you differentiate a weed from a crop seedling?
The ability to do so effectively can mean better crop yields and better stewardship of the environment.
This is what the University of Pretoria has to say regarding maize (corn) farming in southern Africa:
Successful cultivation of maize depends largely on the efficacy of weed control. Weed control during the first six to eight weeks after planting is crucial, because weeds compete vigorously with the crop for nutrients and water during this period. Annual yield losses occur as a result of weed infestations in cultivated crops. Crop yield losses that are attributable to weeds vary with type of weed, type of crop, and the environmental conditions involved. Generally, depending on the level of weed control practiced yield losses can vary from 10 to 100 %. Rarely does one experience zero yield loss due to weeds. Yield losses occur as a result of weed interference with the crop's growth and development. This explains why effective weed control is imperative. In order to do effective control the first critical requirement is correct weed identification.
This dataset contains 5,539 images of crop and weed seedlings. The images are grouped into 12 classes. These classes represent common plant species in Danish agriculture. Each class contains rgb images that show plants at different growth stages. The images are in various sizes and are in png format.
Submissions are evaluated on MeanFScore, which at Kaggle is actually a micro-averaged F1-score.
Given positive/negative rates for each class k, the resulting score is computed this way:
$Precision_{micro} = \frac{\sum_{k \in C} TP_k}{\sum_{k \in C} (TP_k + FP_k)}$
$Recall_{micro} = \frac{\sum_{k \in C} TP_k}{\sum_{k \in C} (TP_k + FN_k)}$
F1-score is the harmonic mean of precision and recall
$MeanFScore = F1_{micro} = \frac{2 \cdot Precision_{micro} \cdot Recall_{micro}}{Precision_{micro} + Recall_{micro}}$
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Theano MKL Problem
import os
os.environ["MKL_THREADING_LAYER"] = "GNU"
# Formatting
import pprint as pretty
from tabulate import tabulate
# Memory usage
# import ipython_memory_usage.ipython_memory_usage as imu
# System related libraries
import os
import importlib
import joblib
from joblib import Parallel, delayed
from time import time
# Handle table-like data and matrices
import pickle
import numpy as np
import pandas as pd
# Collections
import shutil
from itertools import product, compress
from functools import reduce
from operator import itemgetter
from collections import defaultdict
from glob import glob
# Image
import imageio
import cv2
import skimage
import skimage.io
import skimage.transform
from skimage.transform import resize as imresize
# Sklearn
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from sklearn.neighbors import KernelDensity
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Modelling Algorithms
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
# Modelling Helpers
from sklearn.model_selection import train_test_split, StratifiedKFold
# Keras
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Input, Dense, Flatten, Activation, Dropout, Maximum, ZeroPadding2D, Conv2D, MaxPooling2D, LeakyReLU, concatenate
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, TensorBoard
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import xception, vgg16
from tensorflow.keras.metrics import categorical_crossentropy
# Tensorflow
import tensorflow as tf
# Auto-ML
import autosklearn
from tpot import TPOTClassifier
# Visualization
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.gridspec as gridspec
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from mpl_toolkits.axes_grid1 import ImageGrid
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation
import seaborn as sns
from IPython.display import display, HTML
from tqdm import tqdm
# Configure visualisations
plt.style.use('ggplot')
sns.set_style('white')
# pylab.rcParams[ 'figure.figsize' ] = 8 , 6
%matplotlib inline
# Monitoring memory usage in jupyter notebooks; mprof run test.py & mprof plot
# %memit
%load_ext memory_profiler
from numpy.random import seed
seed(21)
# Tensorlfow
tf.random.set_seed(21)
img_size = 96
from subprocess import check_output
print(check_output(["ls", "./"]).decode("utf8"))
%ls data/all
%memit
# Same as label_to_id_dict
# Seedling species in class-id order: list index == integer class id.
_SPECIES = [
    'Black-grass',
    'Charlock',
    'Cleavers',
    'Common Chickweed',
    'Common wheat',
    'Fat Hen',
    'Loose Silky-bent',
    'Maize',
    'Scentless Mayweed',
    'Shepherds Purse',
    'Small-flowered Cranesbill',
    'Sugar beet',
]
# Species name -> class id (same as label_to_id_dict)
CLASS = {name: idx for idx, name in enumerate(_SPECIES)}
# Class id -> species name (same as id_to_label_dict)
INV_CLASS = {idx: name for idx, name in enumerate(_SPECIES)}
# Data directory
root_dir = './data/all'
# train_dir = os.path.join(root_dir, 'train')
# test_dir = os.path.join(root_dir, 'test')
The create_mask_for_plant function returns an image mask matrix of image_height * image_width shape. The mask matrix contains only boolean values of 0 & 1 indicating seedlings background and foreground region.
At the end, we will do morphological close operation to keep the original shape of the foreground (1 blob on the mask image) but close the spurious small holes.

# Build a binary mask isolating the green (plant) pixels of a BGR image.
def create_mask_for_plant(image):
    """Return a 0/255 uint8 mask of the plant foreground.

    The image is converted to HSV, thresholded around the green hue
    (60 +/- 35 with minimum saturation 100 and value 50), then
    morphologically closed to fill small spurious holes while keeping
    the blob's overall shape.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    sensitivity = 35
    lower = np.array([60 - sensitivity, 100, 50])
    upper = np.array([60 + sensitivity, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))
    return cv2.morphologyEx(mask, cv2.MORPH_CLOSE, ellipse)
def segment_plant(image):
    """Black out everything except the masked plant pixels of a BGR image."""
    plant_mask = create_mask_for_plant(image)
    return cv2.bitwise_and(image, image, mask=plant_mask)
def sharpen_image(image):
    """Sharpen via unsharp masking: 1.5 * image - 0.5 * gaussian_blur(image)."""
    blurred = cv2.GaussianBlur(image, (0, 0), 3)
    return cv2.addWeighted(image, 1.5, blurred, -0.5, 0)
# Resize an image, then produce its mask, segmented and sharpened versions.
def mark_segment_sharpen_image(img, img_size=(45, 45)):
    """Return a (resized, mask, segmented, sharpened) tuple for one image."""
    resized = cv2.resize(img.copy(), img_size, interpolation=cv2.INTER_AREA)
    mask = create_mask_for_plant(resized)
    segmented = segment_plant(resized)
    sharpened = sharpen_image(segmented)
    return resized, mask, segmented, sharpened
# Find contours and calculate the largest contour & total area
def find_contours(mask_image):
    # [-2] selects the contour list under both OpenCV return conventions:
    # OpenCV 3 returns (image, contours, hierarchy), OpenCV 4 returns
    # (contours, hierarchy).
    return cv2.findContours(mask_image, cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)[-2]
def calculate_largest_contour_area(contours):
    """Return the area of the biggest contour, or 0 for an empty list."""
    if not contours:
        return 0
    largest = max(contours, key=cv2.contourArea)
    return cv2.contourArea(largest)
def calculate_contours_area(contours, min_contour_area=250):
    """Sum the areas of all contours whose area is >= min_contour_area."""
    areas = (cv2.contourArea(c) for c in contours)
    return sum(a for a in areas if a >= min_contour_area)
# Plot the plant images on a grid according to each class label
def image_grid_plot_for_classes(labels=['Black-grass'], nb_cols=12):
    # One grid row per class label, nb_cols sample images per row.
    # NOTE(review): relies on the module-level `images_per_class` dict
    # (class name -> list of BGR images) having at least nb_cols images
    # per requested class; mutable default argument left as-is.
    fig = plt.figure(1, figsize=(nb_cols, nb_cols))
    grid = ImageGrid(
        fig, 111, nrows_ncols=(len(labels), nb_cols), axes_pad=0.05)
    # Iterating over different class labels
    for i, label in enumerate(labels):
        for j in range(0, nb_cols):
            axs = grid[i * nb_cols + j]
            axs.imshow(cv2.resize(images_per_class[label][j], (150, 150)))
            axs.axis('off')
        # Class label text (annotates the last axis of the row; the
        # original notebook's indentation was lost in export — TODO confirm)
        axs.text(170, 75, label, verticalalignment='center')
    plt.show()
# Plot the plant images on a grid according to each class label
# NOTE(review): this name is redefined later in the file by the
# misclassified-image plotter with a different signature; at module scope
# the later definition shadows this one — consider renaming one of them.
def grid_plot_for_class(labels=['Black-grass'], nb_cols=12):
    # One subplot row per class label, nb_cols sample images per row.
    # Reads the module-level `images_per_class` dict.
    nb_rows, nb_cols = len(labels), nb_cols
    fig, axs = plt.subplots(nb_rows, nb_cols, figsize=(12, 12))
    # Iterating over different class labels
    for i, label in enumerate(labels):
        for j in range(0, nb_cols):
            axs[i, j].imshow(
                cv2.resize(images_per_class[label][j], (150, 150)),
                aspect='auto')
            axs[i, j].axis('off')
            axs[i, j].text(170, 75, label, verticalalignment='center')
    plt.show()
# Plot the plant images according to each class label
def plot_for_class(label, nb_rows=3, nb_cols=3):
    # Show the first nb_rows*nb_cols images of one class from the
    # module-level `images_per_class` dict, hiding the axis tick labels.
    fig, axs = plt.subplots(nb_rows, nb_cols, figsize=(12, 12))
    n = 0
    for i in range(0, nb_rows):
        for j in range(0, nb_cols):
            axs[i, j].xaxis.set_ticklabels([])
            axs[i, j].yaxis.set_ticklabels([])
            axs[i, j].imshow(images_per_class[label][n])
            n += 1
# Scatter plot that draws each sample as its thumbnail image.
def visualize_scatter_with_images(X_2d_data,
                                  images,
                                  figsize=(45, 45),
                                  image_zoom=1):
    """Place every thumbnail at its 2-D embedding coordinate and show it."""
    fig, ax = plt.subplots(figsize=figsize)
    artists = []
    for (x0, y0), thumb in zip(X_2d_data, images):
        box = AnnotationBbox(
            OffsetImage(thumb, zoom=image_zoom), (x0, y0),
            xycoords='data', frameon=False)
        artists.append(ax.add_artist(box))
    # Grow the data limits to cover all annotation positions.
    ax.update_datalim(X_2d_data)
    ax.autoscale()
    plt.show()
# 2-D scatter plot of the embedded samples, one colour per class.
def visualize_scatter(data_2d, label_ids, figsize=(20, 20)):
    """Scatter-plot data_2d (N x 2) coloured by label_ids.

    Relies on the module-level `id_to_label_dict` for the legend names.
    """
    plt.figure(figsize=figsize)
    plt.grid()
    nb_classes = len(np.unique(label_ids))
    for label_id in np.unique(label_ids):
        plt.scatter(
            data_2d[np.where(label_ids == label_id), 0],
            data_2d[np.where(label_ids == label_id), 1],
            marker='o',
            color=plt.cm.Set1(label_id / float(nb_classes)),
            # Fix: linewidth must be numeric — the string '1' is rejected
            # by recent matplotlib versions.
            linewidth=1,
            alpha=0.8,
            label=id_to_label_dict[label_id])
    plt.legend(loc='best')
# 3-D scatter plot of the embedded samples, saved as a rotating GIF.
def visualize_scatter_3D(data_3d, label_ids):
    """Scatter-plot data_3d (N x 3) coloured by label_ids; save a GIF.

    Relies on the module-level `id_to_label_dict` for legend names and on
    ImageMagick being installed for the 'imagemagick' animation writer.
    """
    fig = plt.figure(figsize=(25, 25))
    ax = fig.add_subplot(111, projection='3d')
    plt.grid()
    nb_classes = len(np.unique(label_ids))
    for label_id in np.unique(label_ids):
        ax.scatter(
            data_3d[np.where(label_ids == label_id), 0],
            data_3d[np.where(label_ids == label_id), 1],
            data_3d[np.where(label_ids == label_id), 2],
            alpha=0.8,
            color=plt.cm.Set1(label_id / float(nb_classes)),
            marker='o',
            label=id_to_label_dict[label_id])
    ax.legend(loc='best')
    ax.view_init(25, 45)
    ax.set_xlim(-2.5, 2.5)
    ax.set_ylim(-2.5, 2.5)
    ax.set_zlim(-2.5, 2.5)
    # Create GIF: rotate the azimuth 4 degrees per frame.
    anima = animation.FuncAnimation(
        fig,
        lambda frame_number: ax.view_init(30, 4 * frame_number),
        interval=175,
        frames=90)
    # Fix: create the output directory first so .save() cannot fail on a
    # missing path.
    os.makedirs('./visualization', exist_ok=True)
    anima.save(
        os.path.join('./visualization', 'seedlings_3D.gif'),
        writer='imagemagick')
# Plot the missclassified plant images on a grid
# NOTE(review): this redefines the earlier grid_plot_for_class (per-class
# image grid) with an incompatible signature; at module scope this version
# wins, so later calls passing a list of class labels will misbehave —
# consider renaming one of the two.
def grid_plot_for_class(missclassified, nb_cols=3):
    # `missclassified` is expected to be a DataFrame with 'Image',
    # 'Prediction' and 'True' columns (integer class ids for the latter two).
    nb_rows = int(missclassified.shape[0] / nb_cols) + 1
    fig = plt.figure(figsize=(15, 15))
    # Iterating over different missclassified images
    for index, (_, image) in enumerate(missclassified.iterrows()):
        # Add a subplot. Either a 3-digit integer or three separate integers describing the position of the subplot.
        # If the three integers are R, C, and P in order, the subplot will take the Pth position on
        # a grid with R rows and C columns.
        axs = fig.add_subplot(nb_rows, nb_cols, index + 1)
        # Predicted class name above the image (left) ...
        axs.text(
            0, -15, INV_CLASS[image['Prediction']], horizontalalignment='left')
        axs.imshow(cv2.resize(image['Image'], (150, 150)), aspect='auto')
        # ... and the ground-truth class name next to it.
        axs.text(
            75,
            -15,
            'True: {}'.format(INV_CLASS[image['True']]),
            horizontalalignment='left')
        axs.axis('off')
    plt.show()
# Plot the confusion matrix
def plot_confusion_matrix(cm,
                          target_names,
                          plt_name,
                          rootdir='./',
                          save_dir='save/',
                          title='Confusion matrix',
                          cmap=None,
                          normalize=False):
    """
    plot_confusion_matrix function prints & plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    :param cm:confusion matrix from sklearn.metrics.confusion_matrix
    :param target_names:classification classes list eg. [0, 1] ['high', 'medium', 'low']
    :param rootdir:str
    :param save_dir:str
    :param plt_name:str
    :param title:str
    :param cmap:matplotlib colormap; defaults to 'Blues' when None
    :param normalize:bool
    :return:
    """
    plt_name += '_ConfusionMatrix'
    if normalize:
        plt_name = '{}_Normalized'.format(plt_name)
    # Overall accuracy = diagonal total / grand total.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy
    # Fix: the old default cmap='Jet' is not a registered matplotlib
    # colormap name (valid name is 'jet') and made imshow raise; default
    # to 'Blues' via the existing None branch instead.
    if cmap is None:
        cmap = plt.get_cmap('Blues')
    plt.figure(figsize=(12, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)
    if normalize:
        # Row-normalise so each true class sums to 1.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Threshold used to flip cell-text colour for readability.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(
                j,
                i,
                "{:0.4f}".format(cm[i, j]),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(
                j,
                i,
                "{:,}".format(cm[i, j]),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\n\nAccuracy={:0.4f}; Misclassified={:0.4f}'.
               format(accuracy, misclass))
    # Fix: make sure the save directory exists before writing the figure.
    os.makedirs(rootdir + save_dir, exist_ok=True)
    print('\n\nSaving Confusion Matrices in the {} directory'.format(rootdir +
                                                                     save_dir))
    plt.savefig(
        rootdir + save_dir + '/{}.png'.format(plt_name),
        dpi=200,
        format='png',
        bbox_inches='tight')
    plt.show()
    plt.close()
Let's read the seedling images in BGR (Blue/Green/Red), OpenCV's default format. Using RGB instead would not affect the segmentation. Either way, the HSV (Hue/Saturation/Value) color space will be used for processing the images.
# Flatten the plant images data after Segmentation & Masking
def load_seedlings(root_dir, init_img_size=(150, 150),
                   final_img_size=(45, 45)):
    # Walk one sub-directory per class under root_dir; for every .png:
    # resize -> segment plant -> greyscale -> resize -> flatten.
    # Returns (images, labels, images_per_class), where images_per_class
    # keeps the *original* unprocessed images grouped by class name.
    images = []
    labels = []
    images_per_class = defaultdict(list)
    for class_folder_name in os.listdir(root_dir):
        class_folder_path = os.path.join(root_dir, class_folder_name)
        for image_path in glob(os.path.join(class_folder_path, "*.png")):
            # Read an image
            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
            # Stats for each seedlings class
            images_per_class[class_folder_name].append(image)
            # Resize image to default init_img_size pixels 150*150
            image = cv2.resize(image, init_img_size, interpolation=cv2.INTER_AREA)
            # Segementation
            image = segment_plant(image)
            # BGR2GRAY conversion
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            # Resize image to default final_img_size pixels 45*45
            # (skipped when both sizes already match)
            if (init_img_size != final_img_size):
                image = cv2.resize(
                    image, final_img_size, interpolation=cv2.INTER_AREA)
            # Flatten to a 1-D feature vector for the classical ML models.
            image = image.flatten()
            images.append(image)
            labels.append(class_folder_name)
    images = np.array(images)
    labels = np.array(labels)
    return images, labels, images_per_class
# Load Images, Labels & images_per_class dict
# Start with empty placeholders; real data is loaded from the pickle cache
# when present, otherwise rebuilt from disk below and then cached.
images, labels, images_per_class = np.array([]), np.array([]), np.array([])
try:
    print('Loading Images, Labels & images_per_class dict ...')
    images = joblib.load(os.path.join('./save/pickles', 'images.pickle'))
    labels = joblib.load(os.path.join('./save/pickles', 'labels.pickle'))
    images_per_class = joblib.load(
        os.path.join('./save/pickles', 'images_per_class.pickle'))
    print('Loading done.')
except Exception as e:
    print('Stacktrace', e)
    print('Images, Labels or images_per_class dict does not exist.')
# Rebuild from disk when any of the three caches was missing/empty.
if not (images.size or labels.size) or (not images_per_class):
    print('Running load_seedlings method ...')
    images, labels, images_per_class = load_seedlings(root_dir, final_img_size=(96, 96))
    print('Dumping Images, Labels & images_per_class dict ...')
    if not os.path.exists('./save/pickles'):
        os.makedirs('./save/pickles')
    # Dumping dicts
    joblib.dump(images, os.path.join('./save/pickles', 'images.pickle'))
    joblib.dump(labels, os.path.join('./save/pickles', 'labels.pickle'))
    joblib.dump(images_per_class,
                os.path.join('./save/pickles', 'images_per_class.pickle'))
    print('Images, Labels or images_per_class dict dumping done.')
# Per-class image counts
for key, value in images_per_class.items():
    print("{0} -> {1}".format(key, len(value)))
# Resize a single image to the module-level img_size square (3 channels).
def img_reshape(img):
    """Return img resized to (img_size, img_size, 3)."""
    return imresize(img, (img_size, img_size, 3))
# get image tag (the file-name component of the path)
def img_label(path):
    """Return the final '/'-separated path component, e.g. 'a/b/c.png' -> 'c.png'."""
    # Fix: split() already yields str, so the old double str() cast was
    # redundant.
    return path.split('/')[-1]
# get plant class on image (the parent directory name of the path)
def img_class(path):
    """Return the parent directory name, e.g. 'train/Maize/x.png' -> 'Maize'."""
    parts = path.split('/')
    return str(parts[-2])
# fill train and test dict
def fill_dict(paths, some_dict):
    """Read every image in `paths`, resize it via img_reshape, and append
    the image and its class name to some_dict['image'] / some_dict['label'].

    :param paths: list of image file paths
    :param some_dict: dict with 'image' and 'label' list entries (mutated)
    :return: the same dict, for convenience
    """
    for p in tqdm(paths, ascii=True, desc="loading"):
        # Fix: removed a stray per-file debug print(p) that spammed stdout
        # alongside the tqdm progress bar.
        img = imageio.imread(p)
        img = img_reshape(img)
        some_dict['image'].append(img)
        some_dict['label'].append(img_class(p))
    return some_dict
# read image from dir. and fill train and test dict
def reader():
    # NOTE(review): despite the name and comments, this walks './data/test'
    # only and returns a single dict (the separate test-set handling is
    # commented out); the caller binds the result to `test_dict`.
    file_ext = []
    train_path = []
    test_path = []
    for root, dirs, files in os.walk('./data/test'):
        if dirs != []:
            # Intermediate directory level: just log it.
            print('Root:\n' + str(root))
            print('Dirs:\n' + str(dirs))
        else:
            for f in files:
                # Record each distinct file extension seen.
                ext = os.path.splitext(str(f))[1][1:]
                if ext not in file_ext:
                    file_ext.append(ext)
                # if 'train' in root:
                path = os.path.join(root, f)
                train_path.append(path)
                # elif 'test' in root:
                # path = os.path.join(root, f)
                # test_path.append(path)
    train_dict = {'image': [], 'label': []}
    # test_dict = {'image': [], 'label': []}
    train_dict = fill_dict(train_path, train_dict)
    # test_dict = fill_dict(test_path, test_dict)
    return train_dict
# Build the test-set dict/DataFrame from the directory walk above.
test_dict = reader()
test_dict_df = pd.DataFrame.from_dict(test_dict)
test_dict_df.head()
To understand the data we are now going to consider some key facts about seedlings class and count.
# Dataframe containing seedlings class and count
images_per_class_df = pd.DataFrame({
    'Class': list(images_per_class.keys()),
    'Count': [len(value) for value in images_per_class.values()]
})
images_per_class_df.head()
# Plot seedlings by class and count as a bar chart
seaborn = sns.FacetGrid(images_per_class_df, size=5, aspect=2)
seaborn.map(sns.barplot, 'Class', 'Count', palette='deep')
# Rotate the class names so the long labels stay readable
seaborn.set_xticklabels(rotation=30)
seaborn.add_legend()
Plot an image grid of Small-flowered Cranesbill plant seedlings
plot_for_class("Small-flowered Cranesbill", nb_rows=10, nb_cols=10)
Plot an image grid of Maize plant seedlings
plot_for_class("Maize", nb_rows=10, nb_cols=10)
Plot an image grid of All seedling
image_grid_plot_for_classes(list(images_per_class.keys()), nb_cols=12)
grid_plot_for_class(list(images_per_class.keys()), nb_cols=12)
After statistics analysis & visualizing the different aspects of images, we will sanitize or clean the training data for our model.
As we can see that each image has a background which makes contour of plant leaves obscure or unclear. Hence we will segregate background from foreground and hope it might help us achieve better accuracy.
For removing the background, we'll use the fact that all plant leaves are green and we can create a mask to remove background.
For creating mask, which will remove background, we need to convert BGR/RGB image to HSV. HSV is alternative of the BGR/RGB color model. In HSV, it is easier to represent a color range than in BGR/RGB color space.
Being a simple object detection problem, we will use the color of the object for background segregation. The HSV color-space is suitable for color detection because we can define a color with Hue and it's variations or spectrum using Saturation & Value. eg. Red, Darker Red, Lighter Red.
The following figure illustrates the HSV color space.

As hue varies from 0 to 1.0, the corresponding colors vary from red through yellow, green, cyan, blue, magenta, and back to red, so that there are actually red values both at 0 and 1.0. As saturation varies from 0 to 1.0, the corresponding colors (hues) vary from unsaturated (shades of gray) to fully saturated (no white component). As value, or brightness, varies from 0 to 1.0, the corresponding colors become increasingly brighter.
# Show original / mask / segmented / sharpened versions side by side.
def plot_masked_image(images, nb_cols=4):
    """One subplot row per input image, four processing stages per row."""
    fig, axes = plt.subplots(
        nrows=len(images), ncols=nb_cols, figsize=(12, 12))
    for row, original in enumerate(images):
        # Mask, segment & sharpen this image.
        mask = create_mask_for_plant(original)
        segmented = segment_plant(original)
        sharpened = sharpen_image(segmented)
        # Show the four stages and hide their tick labels.
        for col, panel in enumerate((original, mask, segmented, sharpened)):
            axes[row, col].imshow(panel)
            axes[row, col].xaxis.set_ticklabels([])
            axes[row, col].yaxis.set_ticklabels([])
    plt.show()
# Show the processing stages for a few samples of one class.
plot_masked_image(images_per_class["Small-flowered Cranesbill"][97:101])
# Test image to see the changes
image = images_per_class["Small-flowered Cranesbill"][97]
image_mask = create_mask_for_plant(image)
image_segmented = segment_plant(image)
image_sharpened = sharpen_image(image_segmented)
# Segment Mask & Sharpen for the plant using mark_segment_sharpen_image method
# image, image_mask, image_segmented, image_sharpened = mark_segment_sharpen_image(image, img_size=(150, 150))
fig, axs = plt.subplots(1, 4, figsize=(20, 20))
axs[0].imshow(image)
axs[1].imshow(image_mask)
axs[2].imshow(image_segmented)
axs[3].imshow(image_sharpened)
Let's create a dictionary and reverse dictionary containing mapping from label to id & vice-versa.
# Create a mapping from a flower class to an unique integer id & vice-versa
# NOTE(review): since np.unique returns sorted names, this mapping matches
# the LabelEncoder fitted below — the two encodings are redundant.
label_to_id_dict = {v: i for i, v in enumerate(np.unique(labels))}
id_to_label_dict = {v: k for k, v in label_to_id_dict.items()}
# Covert train images label into id
label_ids = np.array([label_to_id_dict[x] for x in labels])
# Plot of label types numbers
pd.Series(labels).value_counts().plot(
    kind='pie', title='Labels Distribution', figsize=(5, 5))
plt.show()
# Encode labels and create classes
le = LabelEncoder()
le.fit(labels)
labels_encoded = le.transform(labels)
print("\n\nClasses: ", le.classes_)
# Convert labels into categorical values (one-hot matrix, one column/class)
labels_onehot = to_categorical(labels_encoded)
print("\nNumber of One Hot encoded class labels: ", labels_onehot.shape[1])
Now we are ready to train a model and predict. There are loads of predictive modelling algorithms to choose from. We must understand the type of problem and solution requirements to narrow down to a select few baseline models which we can evaluate iteratively.
Our problem falls in the supervised classification category. We want to identify the relationship between a seedling class and an input image.
With above two criteria - Supervised Learning plus Image Classification, we can narrow down our choice of models to a few which are as follow:
The convolutional neural network has the following structure.
Each pair of convolution layers have a max pooling layer and uses batch normalization to reduce overfitting. It also takes advantage of Leaky ReLUs. In dense layers, we will use dropout to reduce overfitting.
In the end, the fully-connected layers will be used for classification which the outputs probability distribution for each of 12 Seedling classes.
Batch normalization is a technique to provide any layer in a neural network with inputs that are zero mean/unit variance. It is used to normalize the input layer by adjusting and scaling the activations.
Leaky ReLUs allow a small, non-zero gradient when the unit is not active.
Parametric ReLUs take this idea further by making the coefficient of leakage into a parameter that is learned along with the other neural network parameters.
BATCH_SIZE = 16
EPOCHS = 16
RANDOM_STATE = 21
# Seedling species ordered so that the tuple index is the class id
# (same as label_to_id_dict / id_to_label_dict).
_CLASS_NAMES = (
    'Black-grass',
    'Charlock',
    'Cleavers',
    'Common Chickweed',
    'Common wheat',
    'Fat Hen',
    'Loose Silky-bent',
    'Maize',
    'Scentless Mayweed',
    'Shepherds Purse',
    'Small-flowered Cranesbill',
    'Sugar beet',
)
# Species name -> class id (same as label_to_id_dict)
CLASS = dict(zip(_CLASS_NAMES, range(len(_CLASS_NAMES))))
# Class id -> species name (same as id_to_label_dict)
INV_CLASS = dict(enumerate(_CLASS_NAMES))
# Define Model Neural Network
def model_neural_network():
    """Build and compile the CNN used for seedling classification.

    Architecture: three blocks of (3 x Conv2D + MaxPooling + Dropout) with
    32/64/128 filters, then Flatten, a 256-unit dense layer with dropout,
    and a 12-way softmax. Compiled with SGD + Nesterov momentum and
    categorical cross-entropy.

    :return: the compiled keras model (its summary is printed as a side
        effect, matching the original behaviour).
    """
    kernel_size = (3, 3)
    pool_size = (2, 2)
    first_filters = 32
    second_filters = 64
    third_filters = 128
    dropout_conv = 0.3
    dropout_dense = 0.3
    model = Sequential()
    model.add(Conv2D(first_filters, kernel_size, activation='relu',
                     input_shape=(img_size, img_size, 3)))
    model.add(Conv2D(first_filters, kernel_size, activation='relu'))
    model.add(Conv2D(first_filters, kernel_size, activation='relu'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))
    model.add(Conv2D(second_filters, kernel_size, activation='relu'))
    model.add(Conv2D(second_filters, kernel_size, activation='relu'))
    model.add(Conv2D(second_filters, kernel_size, activation='relu'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))
    model.add(Conv2D(third_filters, kernel_size, activation='relu'))
    model.add(Conv2D(third_filters, kernel_size, activation='relu'))
    model.add(Conv2D(third_filters, kernel_size, activation='relu'))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(dropout_conv))
    model.add(Flatten())
    model.add(Dense(256, activation="relu"))
    model.add(Dropout(dropout_dense))
    model.add(Dense(12, activation="softmax"))
    # The first 50 epochs with Adam opt & then 30 epochs with SGD opt.
    # mypotim = Adam(lr=2 * 1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # Fix: removed the dead `myopotim = Adam(lr=0.0001)` assignment — a
    # misspelled variable that was never used (SGD below always won).
    mypotim = SGD(lr=1 * 1e-1, momentum=0.9, nesterov=True)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=mypotim,
        metrics=['accuracy'])
    model.summary()
    return model
# Model Callbacks
def get_callbacks(checkpoint_path, patience=5):
    """Return the training callbacks [ReduceLROnPlateau, ModelCheckpoint,
    TensorBoard].

    :param checkpoint_path: template path for ModelCheckpoint files
    :param patience: epochs without improvement before the LR is reduced
    """
    if not os.path.exists('./logs'):
        os.makedirs('./logs')
    # Fix: create the checkpoint directory up front, otherwise
    # ModelCheckpoint fails on its first save.
    os.makedirs(os.path.dirname(checkpoint_path) or '.', exist_ok=True)
    # Fix: with metrics=['accuracy'], tf.keras names the validation metric
    # 'val_accuracy'; monitoring 'val_acc' never matched and these
    # callbacks silently did nothing.
    lr_reduced = ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=0.1,
        patience=patience,
        verbose=1)
    # NOTE(review): earlystopper is created but not returned, matching the
    # original behaviour — early stopping is effectively disabled.
    earlystopper = EarlyStopping(monitor='loss', patience=10, verbose=1)
    checkpoint = ModelCheckpoint(
        checkpoint_path, monitor='val_accuracy', save_best_only=True,
        verbose=1)
    # early_stop = EarlyStopping(monitor='val_acc', patience=patience, mode='max')
    tensorboard = TensorBoard(
        log_dir='./logs',
        histogram_freq=2000,
        write_graph=True,
        write_images=True)
    return [lr_reduced, checkpoint, tensorboard]
# Define Train method
def train(img, label, best_model=""):
    """Train the CNN on (img, label) with heavy augmentation.

    :param img: array of training images
    :param label: one-hot encoded labels
    :param best_model: optional checkpoint file name inside ./checkpoint to
        resume from; training starts from scratch when empty or missing.
    """
    # Setting checkpoint & LR/TensorBoard callbacks.
    callbacks_list = get_callbacks(
        checkpoint_path=os.path.join(
            './checkpoint', 'model_weight_SGD.{epoch:02d}.hdf5'),
        patience=5)
    # Initialize the model neural networks architecture
    model = model_neural_network()
    # Fix: removed `best_model = "-"`, which unconditionally clobbered the
    # parameter and made resuming from a checkpoint impossible.
    if best_model and os.path.exists(os.path.join('./checkpoint', best_model)):
        print('Loading weights from previous run...', best_model)
        model.load_weights(filepath=os.path.join('./checkpoint', best_model))
    else:
        print('Weights of the Best model not loaded.')
        print('Training from scratch ...')
    # Split the data in train & validation set
    x_train, x_valid, y_train, y_valid = train_test_split(
        img,
        label,
        shuffle=True,
        train_size=0.7,
        stratify=label,
        random_state=RANDOM_STATE)
    # To avoid overfitting, randomly rotate, zoom, shift and flip images
    # while fitting the model.
    train_gen = ImageDataGenerator(
        # rescale=1.0/255
        rotation_range=360.,
        width_shift_range=0.3,
        height_shift_range=0.3,
        zoom_range=0.3,
        horizontal_flip=True,
        vertical_flip=True)
    # Validation data is left un-augmented.
    val_gen = ImageDataGenerator(
        # rescale=1.0/255
    )
    # Fitting the model with the image generator & others
    model.fit(
        train_gen.flow(
            x_train, y_train, batch_size=BATCH_SIZE),
        steps_per_epoch=np.ceil(len(x_train) / BATCH_SIZE),
        epochs=EPOCHS,
        verbose=1,
        shuffle=True,
        validation_data=val_gen.flow(
            x_valid, y_valid, batch_size=BATCH_SIZE),
        validation_steps=np.ceil(len(x_valid) / BATCH_SIZE),
        callbacks=callbacks_list)
# Training the model
def train_main():
    """Load (or rebuild and cache) the training dict, then train the CNN."""
    train_dict = None
    prepare_data = False
    if not prepare_data:
        try:
            train_dict = joblib.load(
                os.path.join('./save/pickles', 'train_dict.pickle'))
            print('Loading train & test dict...')
        except Exception as e:
            print('Stacktrace', e)
            print('Train & Test Dict does not exist.')
    # Fix: a failed pickle load previously left `train_dict` undefined and
    # crashed below with a NameError; rebuild from disk when it is missing.
    if train_dict is None:
        train_dict = reader()
        # Dumping dicts
        joblib.dump(train_dict,
                    os.path.join('./save/pickles', 'train_dict.pickle'))
    X_train = np.array(train_dict['image'])
    y_train = to_categorical(
        np.array([CLASS[str(l)] for l in train_dict['label']]))
    # best_model = 'model_weight_SGD.05-0.9526.hdf5'
    train(X_train, y_train, best_model="")
if __name__ == '__main__':
    train_main()
# Define Test method
def test_valid(X_train, x_valid, y_train, y_valid, best_model):
    # Evaluate the checkpointed model on both splits, then show the
    # confusion matrix and classification report for the validation split.
    # y_train / y_valid hold class-name strings and are mapped through the
    # module-level CLASS dict before one-hot encoding.
    # :param best_model: checkpoint file name inside ./checkpoint
    # Initialize the model neural networks architecture & load weights
    model = model_neural_network()
    model.load_weights(filepath=os.path.join("./checkpoint", best_model))
    # Evaluate on train and valid set
    print("\nCalculating Train Set accuracy ...")
    # evaluate() returns [loss, accuracy] for this model; both get the
    # *100 formatting below (so the first entry is loss, not a percentage).
    train_acc = model.evaluate(
        X_train, to_categorical(np.array([CLASS[l] for l in y_train]))
    )
    print(
        "Train Set accuracy: {}%".format([np.round(acc * 100, 2) for acc in train_acc])
    )
    print("\nCalculating Valid Set accuracy ...")
    valid_acc = model.evaluate(
        x_valid, to_categorical(np.array([CLASS[l] for l in y_valid]))
    )
    print(
        "Valid Set accuracy: {}%".format([np.round(acc * 100, 2) for acc in valid_acc])
    )
    print("\nCalculating Confusion Matrix & Classification Report ...")
    # Predictions on the valid set
    prob = model.predict(x_valid, verbose=1)
    predictions = prob.argmax(axis=1)
    # True class label
    true = np.array([CLASS[l] for l in y_valid])
    # Confusion matrix on the valid set.
    con_matrix = confusion_matrix(true, predictions)
    print("\n\nConfusion matrix of the Seedlings Valid set\n\n")
    display(
        pd.DataFrame(con_matrix, columns=list(CLASS.keys()), index=list(CLASS.keys()))
    )
    # Plot the confusion matrix
    plot_confusion_matrix(
        con_matrix,
        target_names=list(CLASS.keys()),
        plt_name="Seedlings_Con_Matrix",
        rootdir="./",
        save_dir="save/con_matrix",
        title="Confusion matrix",
        cmap=plt.cm.Blues,
        normalize=False,
    )
    print("\n\nClassification Report")
    print(classification_report(true, predictions, target_names=list(CLASS.keys())))
# Testing the model on validation set
def valid_main():
    """Recreate the train/valid split and evaluate the saved best model."""
    train_dict = None
    try:
        train_dict = joblib.load(
            os.path.join("./save/pickles", "train_dict.pickle"))
        print("Loading train & test dict...")
    except Exception as e:
        print("Stacktrace", e)
        print("Train & Test Dict does not exist.")
    # Fix: a failed load previously left `train_dict` undefined (NameError).
    if not train_dict:
        train_dict = reader()
        # Dumping dicts
        joblib.dump(train_dict,
                    os.path.join("./save/pickles", "train_dict.pickle"))
    img = np.array(train_dict["image"])
    label = train_dict["label"]
    # Split the data in train & validation set. This must mirror the split
    # done in train(): same ratio and same random_state.
    X_train, x_valid, y_train, y_valid = train_test_split(
        img,
        label,
        shuffle=True,
        train_size=0.7,
        # Fix: stratify referenced an undefined name `target`; the class
        # labels are in `label`.
        stratify=label,
        random_state=RANDOM_STATE,
    )
    # best_model = input('Enter the name of best model saved in ./logs dir: ')
    best_model = "model_weight_SGD.12-0.9832.hdf5"
    test_valid(X_train, x_valid, y_train, y_valid, best_model)
if __name__ == "__main__":
    valid_main()
# Predict on the test images and write the Kaggle submission file.
def test(img, label, best_model):
    """Load `best_model` weights, predict classes for `img`, and write
    ./save/submission.csv with columns (file, species)."""
    # Initialize the model neural networks architecture & load weights
    model = model_neural_network()
    model.load_weights(filepath=os.path.join('./checkpoint', best_model))
    # Predicted class id for every image
    probabilities = model.predict(img, verbose=1)
    class_ids = probabilities.argmax(axis=-1)
    species = [INV_CLASS[c] for c in class_ids]
    submission_df = pd.DataFrame({"file": label, "species": species})
    submission_df.to_csv(
        os.path.join('./save', 'submission.csv'), index=False, header=True)
# Testing the model
def test_main():
    """Load (or rebuild) the cached test data, then run test() with a
    user-chosen checkpoint to produce the submission CSV."""
    # BUG FIX: test_dict must exist even when the pickle load raises,
    # otherwise `if not test_dict` below is a NameError.
    test_dict = None
    try:
        test_dict = joblib.load(
            os.path.join('./save/pickles', 'test_dict.pickle'))
        print('Loading dict...')
    except Exception as e:
        print('Stacktrace', e)
        print('Test Dict does not exist.')
    if not test_dict:
        # reader() returns (train, test); only the test half is needed here
        _, test_dict = reader()
    X_test = np.array(test_dict['image'])
    label = test_dict['label']
    best_model = input('Enter the name of best model saved in ./logs dir: ')
    test(X_test, label, best_model)


if __name__ == '__main__':
    test_main()
Use Keras Pretrained Models
%ls ../../models/keras-pretrained-models
# Load train seedlings data & return images, labels, images_per_class
# (fixed comment: this reads the *train* folder, one sub-folder per class)
def load_seedlings_transfer(path=os.path.join(root_dir, 'train'),
                            img_size=(299, 299)):
    """Read every PNG under each class folder of *path*.

    Returns
    -------
    images : np.ndarray of resized BGR images (img_size each)
    labels : np.ndarray of class-folder names, aligned with images
    images_per_class : dict mapping class name -> list of ORIGINAL
        (un-resized) images, kept for per-class statistics
    """
    images = []
    labels = []
    images_per_class = defaultdict(list)
    for class_folder_name in os.listdir(path):
        class_folder_path = os.path.join(path, class_folder_name)
        for image_path in glob(os.path.join(class_folder_path, "*.png")):
            # Read an image
            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
            # BUG FIX: cv2.imread returns None for unreadable/corrupt
            # files, which would crash cv2.resize below -- skip them.
            if image is None:
                print('Skipping unreadable image:', image_path)
                continue
            # Stats for each seedlings class (original resolution)
            images_per_class[class_folder_name].append(image)
            # Resize to img_size; INTER_AREA is preferred for shrinking
            image = cv2.resize(image, img_size, interpolation=cv2.INTER_AREA)
            images.append(image)
            labels.append(class_folder_name)
    images = np.array(images)
    labels = np.array(labels)
    return images, labels, images_per_class
# Load test seedlings data & return images, ids
def load_seedlings_transfer_test(path=os.path.join(root_dir, 'test'),
                                 img_size=(299, 299)):
    """Read every PNG directly under *path* (no class sub-folders).

    Returns (images, ids): resized BGR images and the matching file
    basenames, used as submission identifiers.
    """
    images = []
    ids = []
    for image_path in glob(os.path.join(path, "*.png")):
        # Read an image
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        # BUG FIX: cv2.imread returns None on failure; skip such files
        # instead of crashing in cv2.resize.
        if image is None:
            print('Skipping unreadable image:', image_path)
            continue
        # Resize image to img_size pixels
        image = cv2.resize(image, img_size, interpolation=cv2.INTER_AREA)
        images.append(image)
        ids.append(os.path.basename(image_path))
    images = np.array(images)
    ids = np.array(ids)
    return images, ids
ls data/all/
The Xception model input dimensions are 299 * 299 * 3. Hence we need to prepare images satisfying these dimensional requirements.
# Constants
input_dim = 299
POOLING = 'avg'
RANDOM_STATE = 21
images_xception = np.array([])
# BUG FIX: define labels up-front; the original only checked
# images_xception.size, so if the images pickle loaded but the labels
# pickle raised, `labels` was undefined at the print below.
labels = None
try:
    print('Loading Images & Labels dict ...')
    images_xception = joblib.load(os.path.join(
        './save/pickles', 'images-xception.pickle'))
    labels = joblib.load(os.path.join(
        './save/pickles', 'labels-xception.pickle'))
    print('Loading done.')
except Exception as e:
    print('Stacktrace', e)
    print('Images & Labels dict does not exist.')
# Rebuild from raw images when either pickle was missing/corrupt
if not images_xception.size or labels is None:
    img_size = (299, 299)
    print('Running load_seedlings method with img_size {} ...'.format(img_size))
    images_xception, labels, images_per_class = load_seedlings_transfer(
        img_size=img_size)
    print('Dumping Images & Labels dict ...')
    # Dumping dicts so the next run can skip the slow disk walk
    joblib.dump(images_xception, os.path.join(
        './save/pickles', 'images-xception.pickle'))
    joblib.dump(labels, os.path.join(
        './save/pickles', 'labels-xception.pickle'))
    print('Images & Labels dict dumping done.')
# Type cast int to float for Xception model compatibility
images_xception = images_xception.astype(dtype='float32', copy=False)
print('Images Shape:', images_xception.shape)
print('Image Data Type:', images_xception[0].dtype)
print('Labels Shape:', labels.shape)
print('Label Data Type:', labels[0].dtype)
%time
# Numpy array to store extracted features from Xception model
# (pre-allocated so each preprocessed image is written in place,
# avoiding a second full copy of the dataset)
images_xception_features = np.zeros(
    (len(images_xception), input_dim, input_dim, 3), dtype='float32')
# Extract features with Xception model
# NOTE(review): this loop only applies xception.preprocess_input (pixel
# scaling); the network itself is run on these arrays in a later cell.
for i, image in tqdm(enumerate(images_xception)):
    x = xception.preprocess_input(np.expand_dims(image, axis=0))
    images_xception_features[i] = x
print('Train Images shape: {} size: {:,}'.format(
    images_xception_features.shape, images_xception_features.size))
print('Dumping images_xception_features ...')
joblib.dump(images_xception_features, os.path.join(
    './save/pickles', 'images_xception_features.pickle'))
print('images_xception_features dumping done.')
# Free memory; both arrays are reloaded from disk in the next cell
del images_xception_features, labels
try:
    print('Loading images_xception_features & Labels ...')
    images_xception_features = joblib.load(os.path.join(
        './save/pickles', 'images_xception_features.pickle'))
    labels = joblib.load(os.path.join(
        './save/pickles', 'labels-xception.pickle'))
    print('Loading done.')
except Exception as e:
    # NOTE(review): only logs the failure; train_test_split below will
    # raise NameError if the pickles were never written -- run the
    # preprocessing/dump cell first.
    print('Stacktrace', e)
    print('images_xception_features & Labels does not exist.')
# Split the data in train & validation set (stratified 80/20)
x_train, x_valid, y_train, y_valid = train_test_split(
    images_xception_features,
    labels,
    shuffle=True,
    train_size=0.8,
    stratify=labels,
    random_state=RANDOM_STATE
)
# Free the full feature array; only the splits are used from here on
del images_xception_features, labels
# Xception V1 model, with weights pre-trained on ImageNet. On ImageNet, this model gets to
# a top-1 validation accuracy of 0.790 and a top-5 validation accuracy of 0.945. Note that this
# model only supports the data format 'channels_last' (height, width, channels).
# The default input size for this model is 299x299.
# include_top=False drops the ImageNet classifier head; pooling='avg'
# global-pools the final conv block so predict() returns one flat
# feature vector per image (the "bottleneck features").
xception_bottleneck = xception.Xception(
    weights='imagenet', include_top=False, pooling=POOLING)
# Calculating Xception train & valid set bottleneck features
x_train_botf = xception_bottleneck.predict(x_train, batch_size=32, verbose=1)
x_valid_botf = xception_bottleneck.predict(x_valid, batch_size=32, verbose=1)
print('Xception train bottleneck features shape: {} size: {:,}'.format(
    x_train_botf.shape, x_train_botf.size))
print('Xception valid bottleneck features shape: {} size: {:,}'.format(
    x_valid_botf.shape, x_valid_botf.size))
# Dumping Xception train & valid set bottleneck features so downstream
# classifier cells can run without recomputing the forward passes
joblib.dump(x_train_botf, os.path.join(
    './save/pickles', 'x_train_botf.pickle'))
joblib.dump(x_valid_botf, os.path.join(
    './save/pickles', 'x_valid_botf.pickle'))
joblib.dump(y_train, os.path.join('./save/pickles', 'y_train_botf.pickle'))
joblib.dump(y_valid, os.path.join('./save/pickles', 'y_valid_botf.pickle'))
print('Dumping Xception train & valid set bottleneck features & labels done.')
# Logistic Regression classifier trained on the bottleneck features
logreg = LogisticRegression(
    multi_class='multinomial', solver='lbfgs', random_state=RANDOM_STATE)
logreg.fit(x_train_botf, y_train)
# Class probabilities (kept for inspection) and hard predictions
valid_probs = logreg.predict_proba(x_valid_botf)
valid_preds = logreg.predict(x_valid_botf)
print('Logistic Regression model Accuracy on valid set using Xception Bottleneck Features: {}%'.format(
    np.round(accuracy_score(y_valid, valid_preds)*100, 2)))
Confusion Matrix
con_matrix = confusion_matrix(y_valid, valid_preds)
# Plot confusion matrix using seaborn heatmap
fig, ax = plt.subplots(1, figsize=(15, 15))
ax = sns.heatmap(con_matrix, ax=ax, cmap=plt.cm.Oranges, annot=True)
# NOTE(review): labeling ticks with list(CLASS.keys()) assumes that order
# matches confusion_matrix's sorted label order -- confirm CLASS keys are
# in the same (alphabetical) order as the label strings.
ax.set_xticklabels(list(CLASS.keys()), rotation=45)
ax.set_yticklabels(list(CLASS.keys()), rotation=45)
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.title('Confusion Matrix')
fig.savefig(os.path.join('./save/con_matrix',
                         'Seedlings_logreg_Confusion_Matrix.png'), dpi=300)
plt.show()
After fitting our train dataset on Xception Bottleneck Features, you can run a general Classifier Comparison module containing sklearn classifiers with vanilla configurations.
%%time
def classifier_comparison(n_splits=3):
    """Benchmark a battery of vanilla classifiers on the Xception
    bottleneck features (module-level x_train_botf / y_train) using
    stratified K-fold cross-validation.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    defaultdict(list)
        Maps classifier class name -> list of per-fold accuracies.
    """
    # FIX: the original kept a parallel `names` list that was never used
    # and had a missing comma ("GradientBoost" "Naive Bayes" silently
    # concatenated into one string, leaving 11 names for 12 classifiers).
    # Results are keyed on clf.__class__.__name__, so the list is dropped.
    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        MLPClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        XGBClassifier()]
    # Split & shuffle data using Stratified K Fold
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    # Calculate accuracy over all splits, classifiers
    count = 0
    acc_dict = defaultdict(list)
    for train_index, test_index in kfold.split(x_train_botf, y_train):
        xtrain, ytrain = x_train_botf[train_index], y_train[train_index]
        xtest, ytest = x_train_botf[test_index], y_train[test_index]
        print('Iteration: ', count)
        # iterate over classifiers
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(xtrain, ytrain)
            predictions = clf.predict(xtest)
            accuracy = accuracy_score(ytest, predictions)
            # Append results in default dict of lists
            acc_dict[clf_name].append(accuracy)
        count += 1
    return acc_dict
# Call the classifier_comparison module & save the results
acc_dict = classifier_comparison(n_splits=3)
# Dataframe containing classifier name & accuracy score
# (rows = classifiers, one column per CV fold)
acc_df = pd.DataFrame.from_dict(acc_dict, orient='index')
acc_df = acc_df.rename(
    columns={0: '1st Iteration', 1: '2nd Iteration', 2: '3rd Iteration'})
# Calculate average accuracy across the folds
acc_df['avg_acc'] = acc_df.mean(axis=1)
# Express as percentages rounded to 2 decimal places
acc_df = (acc_df*100).round(2)
sorted_acc_df = acc_df.sort_values(by='avg_acc', ascending=False)
display(sorted_acc_df)
# Plot a barplot containing classifier name & accuracy score
sns.barplot(x='avg_acc', y=sorted_acc_df.index, data=sorted_acc_df)
plt.xlabel('Average Accuracy')
plt.title('Classifier Comparison based on Accuracy')
It seems the simple neural network (MLP) performs much better than the others. Hence we can train a full-blown fully connected network in Keras.
# Loading Xception train & valid set bottleneck features
# (original comment said "Dumping" -- these are joblib *loads*)
x_train_botf = joblib.load(os.path.join(
    './save/pickles', 'x_train_botf.pickle'))
x_valid_botf = joblib.load(os.path.join(
    './save/pickles', 'x_valid_botf.pickle'))
y_train_botf = joblib.load(os.path.join(
    './save/pickles', 'y_train_botf.pickle'))
y_valid_botf = joblib.load(os.path.join(
    './save/pickles', 'y_valid_botf.pickle'))
print('Loading Xception train & valid set bottleneck features & labels done.')
%%time
BATCH_SIZE = 16
EPOCHS = 500
RANDOM_STATE = 21
# Fixing random seed for reproducibility
np.random.seed(RANDOM_STATE)
# Reduce LR On Plateau: shrink LR by 10x after 5 stagnant epochs
# NOTE(review): `epsilon` was renamed `min_delta` in newer Keras --
# confirm the installed version still accepts `epsilon`.
lr_reduced = ReduceLROnPlateau(
    monitor='val_acc', factor=0.1, epsilon=1e-5, patience=5, verbose=1)
# Setting checkpoint & early stopping
# Checkpoint file name embeds epoch and training accuracy
checkpoint_path = './checkpoint/Xception_ADAM_best_weights.{epoch:02d}-{acc:.4f}.hdf5'
checkpoint = ModelCheckpoint(
    checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# TensorBoard logs
# NOTE(review): './logs{}'.format(time()) produces e.g. './logs15954...'
# with no path separator -- presumably './logs/{}' was intended; verify.
tensorboard = TensorBoard(log_dir='./logs{}'.format(time()),
                          histogram_freq=2000, write_graph=True, write_images=True)
# Early Stopping (currently disabled)
# early_stop = EarlyStopping(monitor='val_acc', patience=5, mode='max')
# All callbacks
callbacks_list = [lr_reduced, checkpoint, tensorboard]
# Model Neural Network Architecture
# Five fully connected layers of tapering width with decreasing dropout,
# fed by the 2048-d Xception bottleneck vector; 12-way softmax output.
inputs = Input(shape=(2048,))
# FIX: the original passed input_shape= to every Dense layer; in the
# Keras functional API that argument is ignored (the shape flows from
# `inputs`), so the misleading kwargs are removed -- behavior unchanged.
fc = Dense(units=2048, activation='relu')(inputs)
fc = Dropout(0.5)(fc)
fc = Dense(units=1024, activation='relu')(fc)
fc = Dropout(0.4)(fc)
fc = Dense(units=512, activation='relu')(fc)
fc = Dropout(0.3)(fc)
fc = Dense(units=256, activation='relu')(fc)
fc = Dropout(0.2)(fc)
fc = Dense(units=128, activation='relu')(fc)
fc = Dropout(0.1)(fc)
# Adding output layer: one unit per seedling class
outputs = Dense(units=12, activation='softmax')(fc)
model = Model(inputs=inputs, outputs=outputs)
best_model = 'Xception_ADAM_best_weights.41-0.9428.hdf5'
# Load network weights from a previous run if the checkpoint exists;
# otherwise training starts from freshly initialized weights.
if os.path.exists(os.path.join('./checkpoint', best_model)):
    print('Loading weights from previous run...', best_model)
    model.load_weights(filepath=os.path.join('./checkpoint', best_model))
else:
    print('Weights not loaded.', best_model)
# Adam with a 2e-3 starting LR; SGD alternative kept for experimentation
# NOTE(review): `lr` was renamed `learning_rate` in newer Keras; confirm.
mypotim = Adam(lr=2 * 1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# mypotim = SGD(lr=1 * 1e-1, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=mypotim,
              metrics=['accuracy'])
# Model Summary
model.summary()
# Convert categorical labels into one-hot vector via the CLASS mapping
y_train_botf = to_categorical(np.array([CLASS[l] for l in y_train_botf]))
print('The Best Model will be saved in the following dir ', checkpoint_path)
print('Training of the model with Xception bottleneck features as inputs starts now ...')
# Fit the model (30% of this training split is held out for validation)
model.fit(x_train_botf, y_train_botf, validation_split=0.3, batch_size=BATCH_SIZE,
          epochs=EPOCHS, verbose=1, shuffle=True, callbacks=callbacks_list)
display(features_df.head())
# Select the numeric features (plus label) and derive area per contour.
# FIX: removed a stray backtick after '=' that made the assignment a
# syntax error in the original.
x_train_features = features_df.loc[:, [
    'height', 'width', 'largest_contour_area', 'label']]
x_train_features['area_per_contour'] = features_df.loc[:,
    'area'] / features_df.loc[:, 'no_of_countours']
print('Displaying Null rows/columns ...')
display(x_train_features[x_train_features.isnull().any(axis=1)])
# Drop NA rows (e.g. zero-contour divisions) and re-index
x_train_features.dropna(inplace=True)
x_train_features.reset_index(drop=True, inplace=True)
# Pop out labels before scaling so only numeric columns are transformed
labels = x_train_features.pop('label')
# Scale features into [0, 1] using MinMaxScaler
x_train_features_scaled = MinMaxScaler().fit_transform(x_train_features)
x_train_features.loc[:] = x_train_features_scaled
print('\nTop 10 rows')
x_train_features.head()
# Encode labels and create classes
# LabelEncoder maps each species string to an integer 0..n_classes-1
le = LabelEncoder()
le.fit(labels)
labels_encoded = le.transform(labels)
print("\n\nClasses: ", le.classes_)
# Convert labels into categorical values (one-hot matrix)
labels_onehot = to_categorical(labels_encoded)
print("\nNumber of One Hot encoded class labels: ", labels_onehot.shape[1])
%%time
# Materialize as plain numpy arrays for fancy indexing in the K-fold loop
x_train_features = np.array(x_train_features)
labels_encoded = np.array(labels_encoded)
def classifier_comparison(n_splits=3):
    """Benchmark a battery of vanilla classifiers on the hand-crafted
    image features (module-level x_train_features / labels_encoded)
    using stratified K-fold cross-validation.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    defaultdict(list)
        Maps classifier class name -> list of per-fold accuracies.
    """
    # FIX: the original kept a parallel `names` list that was never used
    # and had a missing comma ("GradientBoost" "Naive Bayes" silently
    # concatenated into one string, leaving 11 names for 12 classifiers).
    # Results are keyed on clf.__class__.__name__, so the list is dropped.
    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        GaussianProcessClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        MLPClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        XGBClassifier()]
    # Split & shuffle data using Stratified K Fold
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    # Calculate accuracy over all splits, classifiers
    count = 0
    acc_dict = defaultdict(list)
    for train_index, test_index in kfold.split(x_train_features, labels_encoded):
        xtrain, ytrain = x_train_features[train_index], labels_encoded[train_index]
        xtest, ytest = x_train_features[test_index], labels_encoded[test_index]
        print('Iteration: ', count)
        # iterate over classifiers
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(xtrain, ytrain)
            predictions = clf.predict(xtest)
            accuracy = accuracy_score(ytest, predictions)
            # Append results in default dict of lists
            acc_dict[clf_name].append(accuracy)
        count += 1
    return acc_dict
# Call the classifier_comparison module & save the results
acc_dict = classifier_comparison(n_splits=3)
# Dataframe containing classifier name & accuracy score
# (rows = classifiers, one column per CV fold)
acc_df = pd.DataFrame.from_dict(acc_dict, orient='index')
acc_df = acc_df.rename(
    columns={0: '1st Iteration', 1: '2nd Iteration', 2: '3rd Iteration'})
# Calculate average accuracy across the folds
acc_df['avg_acc'] = acc_df.mean(axis=1)
# Express as percentages rounded to 2 decimal places
acc_df = (acc_df*100).round(2)
sorted_acc_df = acc_df.sort_values(by='avg_acc', ascending=False)
display(sorted_acc_df)
# Plot a barplot containing classifier name & accuracy score
sns.barplot(x='avg_acc', y=sorted_acc_df.index, data=sorted_acc_df)
plt.xlabel('Average Accuracy')
plt.title('Classifier Comparison based on Accuracy')
It seems the simple XGB classifier performs much better than the others, but its accuracy is much lower than that of a full-blown CNN.
This conversion needs to be done so that the model can be loaded into the web app.
!pip install tensorflowjs
# Use the command line conversion tool to convert the model
!tensorflowjs_converter --input_format keras model.h5 tfjs_model/model
It's now time for you to get your hands even dirtier and go at it all by yourself in a challenge!